Updated: Sep 20, 2023 at 22:00:52 PDT.

Working directory: /home/groups/jgrimmer/trump_blocs.

Reads:

  • data/ipums-census/usa_00006.csv

Writes:

  • data/census_voteblocs.fst
  • data/ipums-census/census_sample.csv

Setup

rm(list = ls())

library(dplyr)
library(data.table)

Load data

census <- fread("data/ipums-census/usa_00006.csv")

Recode variables

census[ ,
        `:=`(
            year   = as.character(YEAR),
            statefips = coler::lz_pad(STATEFIP, 2),
            weight = PERWT / 100,
            urban = recode(METRO, "1" = "rural", "2" = "urban", "3" = "suburban",
                           .default = NA_character_),

            gender = recode(SEX, `1` = "male", `2` = "female",
                            .default = NA_character_),
            age = as.numeric(AGE),
            race   = recode(RACE, "1" = "white", "2" = "black",
                            .default = NA_character_),
            hisp_origin = recode(HISPAN, "1" = "mexican", "2" = "puerto rican",
                                 "3" = "cuban", "4" = "other",
                                 .default = NA_character_),
            educ = as.numeric(EDUCD) %>%
            {
                case_when(. < 64                   ~ "HS or less",
                          # Includes associate's degree
                          . %in% 65:99                  ~ "some college",
                          # Includes 4+ years of college
                          . %in% c(100:116)  ~ "college")
            },

            region = REGION %>%
              {
                case_when(. %in% 11:13  ~ "Northeast",
                          . %in% 21:23  ~ "Midwest",
                          . %in% 31:34  ~ "South",
                          . %in% 41:43  ~ "West",
                          . %in% 91:99  ~ NA_character_)
              }
        )
        ] %>% 
  filter(age >= 18)
# Recode missing values NOW for proper quantiles
census[ ,
        fincome := case_when(FTOTINC %in%
                                 c(9999998, 9999999) ~ NA_integer_,
                             # net loss
                             # FTOTINC == -1      ~ 0,
                             TRUE               ~ FTOTINC)]
census[ ,
        `:=`(fincome_anes = cut(fincome,
                                labels = c("0-16ptile", "17-33ptile", "34-67ptile",
                                           "68-95ptile", "96-100ptile"),
                                breaks = quantile(fincome,
                                                  probs = c(0, .16, .33, .67,
                                                            .95, 1),
                                                  na.rm = TRUE),
                                include.lowest = TRUE),
             faminc_quin = ntile(fincome, 5) %>%
               recode_factor(`1` = "1st",
                             `2` = "2nd",
                             `3` = "3rd",
                             `4` = "4th",
                             `5` = "5th"
               )
        ),
        by = .(year)
]
# Recode Hispanic from white
census[HISPAN %in% 1:4, race := "hispanic"]

Subset to B/W/H.

census <- dplyr::filter(census, race %in% c("black", "white", "hispanic"))

Age bin

Copied from Will’s CCES code: age groups in 10-year bands, except under 20 is counted w/ 20s

mfloor <- function(x,base){
    base * floor(x/base)
}

census[ , age_bin := mfloor(age, 10)]
## Warning in `[.data.table`(census, , `:=`(age_bin, mfloor(age, 10))): Invalid
## .internal.selfref detected and fixed by taking a (shallow) copy of the
## data.table so that := can add this new column by reference. At an earlier
## point, this data.table has been copied by R (or was created manually using
## structure() or similar). Avoid names<- and attr<- which in R currently (and
## oddly) may copy the whole data.table. Use set* syntax instead to avoid copying:
## ?set, ?setnames and ?setattr. If this message doesn't help, please report your
## use case to the data.table issue tracker so the root cause can be fixed or this
## message improved.
census[age < 20, age_bin := 20]

Add South dummy

fips <- fastLink::statefips

setDT(fips, key = "statefips")

fips[ , south := as.numeric(state %in%
                                c("TN", "VA", "NC", "SC", "FL",
                                  "GA", "AL", "MS", "LA", "AR", "TX",
                                  # Schickler adds these
                                  "OK", "KY"))
      ]
census[fips,
       c("south", "state") := .(i.south, i.state),
       on = "statefips"
       ]
sources <-
    tribble( ~ SAMPLE, ~ source,
             202001, "ACS",
             201801, "ACS",
             201601, "ACS",
             201401, "ACS",
             201201, "ACS",
             201007, "Census",
             201001, "ACS",
             200801, "ACS",
             200601, "ACS",
             200401, "ACS",
             200201, "ACS",
             200001, "Census",
             199001, "Census",
             198001, "Census",
             197002, "Census",
             197001, "Census",
             196002, "Census",
             195001, "Census")

setDT(sources, key = "source")

census[sources,
       source := i.source,
       on = "SAMPLE"]

Export

keep <- grep("[a-z]", names(census), value = TRUE)
census_out <- dplyr::select(census, all_of(keep))
setDT(census_out)

skimr::skim(census_out)
Data summary
Name census_out
Number of rows 104064467
Number of columns 17
Key NULL
_______________________
Column type frequency:
character 9
factor 3
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
year 0 1.00 4 4 0 16 0
statefips 0 1.00 2 2 0 51 0
urban 48085787 0.54 5 8 0 3 0
gender 0 1.00 4 6 0 2 0
race 0 1.00 5 8 0 3 0
hisp_origin 91474474 0.12 5 12 0 4 0
educ 29142747 0.72 7 12 0 3 0
region 0 1.00 4 9 0 4 0
source 0 1.00 3 6 0 2 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
fincome_anes 32281552 0.69 FALSE 5 34-: 24453626, 68-: 19913998, 17-: 12391111, 0-1: 11508151
faminc_quin 32281552 0.69 FALSE 5 3rd: 14450935, 2nd: 14446234, 4th: 14381098, 1st: 14347783
state 0 1.00 FALSE 51 CA: 10754176, TX: 7664636, NY: 7222638, FL: 5632367

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
weight 0 1.00 0.44 0.68 0 0.1 0.2 0.48 4.044e+01 ▇▁▁▁▁
age 0 1.00 36.60 22.98 0 17.0 35.0 54.00 1.000e+02 ▇▇▆▃▁
fincome 32281552 0.69 48278.34 64558.81 -39996 10925.0 29215.0 61500.00 3.164e+06 ▇▁▁▁▁
age_bin 0 1.00 36.51 18.27 20 20.0 30.0 50.00 1.000e+02 ▇▃▂▁▁
south 0 1.00 0.33 0.47 0 0.0 0.0 1.00 1.000e+00 ▇▁▁▁▃
fst::write_fst(census_out, "data/census_clean.fst")

# Sample to work with locally
fwrite(census_out %>% group_by(year) %>%
           sample_frac(0.001),
       "data/ipums-census/census_sample.csv")